// Copyright (C) 2022 Wu Zhangjin <falcon@ruma.tech>, All Rights Reserved.
//
// GCC Inline Assembly: https://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html
//                      https://www.cristal.univ-lille.fr/~marquet/ens/ctx/doc/l-ia.html
//                      https://wiki.osdev.org/Inline_assembly

// X86_64 ISA (reference only; the inline assembly below uses ARM/AArch64 mnemonics: b, cbz, cbnz): https://www.aldeid.com/wiki/X86-assembly/Instructions

#include <pthread.h>

#include <atomic>

#include "benchmark/benchmark.h"

// Selects what benchmark_DoNotOptimize() expands to:
//   0              - benchmark::DoNotOptimize() applied to state.iterations()
//   any other value - an empty statement
#define OPTIMIZE_LEVEL 1

// benchmark_DoNotOptimize() is written as a statement followed by the caller's
// own ';'. The OPTIMIZE_LEVEL == 0 form expects a benchmark::State named
// `state` to be in scope at the point of use.
//
// Both expansions are wrapped in do { } while (0) and carry no trailing
// semicolon, so the macro is always exactly one statement (it can sit in an
// unbraced if/else without terminating the if early).
#if defined(OPTIMIZE_LEVEL) && (OPTIMIZE_LEVEL == 0)
#define benchmark_DoNotOptimize() \
    do { benchmark::DoNotOptimize(state.iterations()); } while (0)
#else
#define benchmark_DoNotOptimize() do { } while (0)
#endif

// Flag tested on every iteration by the BM_*load_* benchmarks and stored to by
// thread_handler through a volatile pointer. The volatile qualifier makes each
// read and write an access the compiler must actually perform.
// NOTE(review): volatile is not a synchronization primitive; the concurrent
// accesses from thread_handler appear to be intentional for these measurements.
volatile int enabled;

// Baseline benchmark: each iteration executes a single `nop` instruction.
void BM_nop(benchmark::State& state)
{
    for (auto _ : state)
    {
        benchmark_DoNotOptimize();
        // volatile + "memory" clobber: the statement is kept and acts as a
        // compiler-level memory barrier.
        asm volatile("nop" : : : "memory");
    }
}
BENCHMARK(BM_nop);
// BENCHMARK(BM_nop)->ThreadPerCpu();

// Times an unconditional branch (`b`) whose target, local label 2, is the
// instruction immediately after it.
void BM_ub(benchmark::State& state)
{
    for (auto _ : state)
    {
        benchmark_DoNotOptimize();
        // Local label 1 marks the branch itself; 2f is the next "2:" label
        // searching forward.
        asm volatile("1: b 2f \n2:" : : : "memory");
    }
}
BENCHMARK(BM_ub);
// BENCHMARK(BM_ub)->ThreadPerCpu();

// Times a compare-and-branch-if-nonzero (`cbnz`). The operand is 1, so the
// branch condition holds on every iteration; the branch target (label 2) is
// the instruction immediately after the branch.
void BM_bnez(benchmark::State& state) {
    // No `register` storage class: it was removed in C++17, and the "r"
    // constraint below already requires the value to be in a general register.
    int x = 1;

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        // %w0 names the 32-bit view of the register, matching the `int`
        // operand. A bare %0 is printed as the 64-bit register name by
        // GCC/Clang on AArch64, which would also test the upper 32 bits that
        // a 32-bit value does not define.
        asm volatile (
            "1: cbnz %w0, 2f \n"
            "2:"
            :
            :"r" (x)
            :"memory");
    }
}
BENCHMARK(BM_bnez);
// BENCHMARK(BM_bnez)->ThreadPerCpu();

// Times a compare-and-branch-if-zero (`cbz`). The operand is 0, so the branch
// condition holds on every iteration; the branch target (label 2) is the
// instruction immediately after the branch.
void BM_beqz(benchmark::State& state) {
    // No `register` storage class: it was removed in C++17, and the "r"
    // constraint below already requires the value to be in a general register.
    int x = 0;

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        // %w0 names the 32-bit view of the register, matching the `int`
        // operand. A bare %0 is printed as the 64-bit register name by
        // GCC/Clang on AArch64, which would also test the upper 32 bits that
        // a 32-bit value does not define.
        asm volatile (
            "1: cbz %w0, 2f \n"
            "2:"
            :
            :"r" (x)
            :"memory");
    }
}
BENCHMARK(BM_beqz);
// BENCHMARK(BM_beqz)->ThreadPerCpu();

// Times a load of the global `enabled` flag followed by a nonzero test.
// The flag is set to 1 up front and nothing in this function changes it.
void BM_load_bnez(benchmark::State& state)
{
    enabled = 1;

    for (auto _ : state)
    {
        benchmark_DoNotOptimize();
        // `enabled` is volatile, so this is a real load every iteration.
        if (enabled)
        {
            // Empty asm with a "memory" clobber: emits no instruction, only a
            // compiler barrier on the taken path.
            asm volatile("" : : : "memory");
        }
    }
}
BENCHMARK(BM_load_bnez);
// BENCHMARK(BM_load_bnez)->ThreadPerCpu();

// Times a load of the global `enabled` flag followed by a zero test.
// The flag is cleared up front and nothing in this function changes it.
void BM_load_beqz(benchmark::State& state)
{
    enabled = 0;

    for (auto _ : state)
    {
        benchmark_DoNotOptimize();
        // `enabled` is volatile, so this is a real load every iteration.
        if (!enabled)
        {
            // Empty asm with a "memory" clobber: emits no instruction, only a
            // compiler barrier on the taken path.
            asm volatile("" : : : "memory");
        }
    }
}
BENCHMARK(BM_load_beqz);
// BENCHMARK(BM_load_beqz)->ThreadPerCpu();

// Handshake flags shared between a benchmark function and its helper thread.
// They are written by one thread and polled by the other, so they are
// std::atomic rather than volatile: volatile does not make concurrent access
// well-defined. Plain assignment and `!flag` tests keep working unchanged.
std::atomic<int> thread_start{0};   // helper thread sets this to 1 once it runs
std::atomic<int> thread_exit{0};    // benchmark sets this to 1 to stop the helper

// Parameters handed to the helper thread through pthread_create()'s argument.
struct th_data {
    int enabled;    // value the benchmark stored in the global `enabled` flag
    int type;       // store pattern: CACHE_MISS, CACHE_BRANCH_MISS, BRANCH_MISS or NO_MISS
};
// Filled in before pthread_create() is called and not modified while the
// helper runs, so no volatile qualifier is needed; pthread_create() itself
// orders these writes before the new thread's reads.
struct th_data tdata;

// Store patterns understood by thread_handler (selected via th_data::type).
// Enumerators are numbered consecutively starting at 0.
enum {
    CACHE_MISS = 0,        // 0: keep storing the initial value
    CACHE_BRANCH_MISS,     // 1: alternate between the two values
    BRANCH_MISS,           // 2: keep storing 1 - initial value
    NO_MISS                // 3: store nothing
};

static void *thread_handler (void *data)
{
    volatile int *ptr = &enabled;
    struct th_data *td = (struct th_data *)data;
    long i = td->enabled;

    thread_start = 1;

    while (!thread_exit) {
        switch (td->type) {
            case CACHE_MISS:
                *ptr = i;
                break;
            case BRANCH_MISS:
                *ptr = 1 - i;
                break;
            case CACHE_BRANCH_MISS:
                i = 1 - i;
                *ptr = i;
                break;
            default:
                break;
        }
    }

    return NULL;
}

// Times the load + nonzero test of `enabled` (initially 1) while a helper
// thread runs thread_handler in CACHE_MISS mode, i.e. keeps storing that same
// value to `enabled`.
void BM_cache_miss_load_bnez(benchmark::State& state) {
    pthread_t th;

    enabled = 1;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_MISS;

    // pthread_create() returns nonzero on failure. Without this check the
    // wait on thread_start below would spin forever.
    if (pthread_create(&th, nullptr, thread_handler, const_cast<th_data *>(&tdata)) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper reports that it is running.
    while (!thread_start) { }

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        if (enabled != 0)
            asm volatile ("":::"memory");
    }

    // Stop the helper and wait for it before the next benchmark reuses the globals.
    thread_exit = 1;
    pthread_join(th, nullptr);
}
BENCHMARK(BM_cache_miss_load_bnez);
//BENCHMARK(BM_cache_miss_load_bnez)->ThreadRange(1,3);
//BENCHMARK(BM_cache_miss_load_bnez)->ThreadPerCpu();

// Times the load + zero test of `enabled` (initially 0) while a helper thread
// runs thread_handler in CACHE_MISS mode, i.e. keeps storing that same value
// to `enabled`.
void BM_cache_miss_load_beqz(benchmark::State& state) {
    pthread_t th;

    enabled = 0;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_MISS;

    // pthread_create() returns nonzero on failure. Without this check the
    // wait on thread_start below would spin forever.
    if (pthread_create(&th, nullptr, thread_handler, const_cast<th_data *>(&tdata)) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper reports that it is running.
    while (!thread_start) { }

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        if (enabled == 0)
            asm volatile ("":::"memory");
    }

    // Stop the helper and wait for it before the next benchmark reuses the globals.
    thread_exit = 1;
    pthread_join(th, nullptr);
}
BENCHMARK(BM_cache_miss_load_beqz);
//BENCHMARK(BM_cache_miss_load_beqz)->ThreadRange(1,3);
//BENCHMARK(BM_cache_miss_load_beqz)->ThreadPerCpu();

// Times the load + nonzero test of `enabled` (initially 1) while a helper
// thread runs thread_handler in BRANCH_MISS mode, i.e. keeps storing the
// opposite of the initial value to `enabled`.
void BM_branch_miss_load_bnez(benchmark::State& state) {
    pthread_t th;

    enabled = 1;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = BRANCH_MISS;

    // pthread_create() returns nonzero on failure. Without this check the
    // wait on thread_start below would spin forever.
    if (pthread_create(&th, nullptr, thread_handler, const_cast<th_data *>(&tdata)) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper reports that it is running.
    while (!thread_start) { }

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        if (enabled != 0)
            asm volatile ("":::"memory");
    }

    // Stop the helper and wait for it before the next benchmark reuses the globals.
    thread_exit = 1;
    pthread_join(th, nullptr);
}
BENCHMARK(BM_branch_miss_load_bnez);
//BENCHMARK(BM_branch_miss_load_bnez)->ThreadRange(1,3);
//BENCHMARK(BM_branch_miss_load_bnez)->ThreadPerCpu();

// Times the load + zero test of `enabled` (initially 0) while a helper thread
// runs thread_handler in BRANCH_MISS mode, i.e. keeps storing the opposite of
// the initial value to `enabled`.
void BM_branch_miss_load_beqz(benchmark::State& state) {
    pthread_t th;

    enabled = 0;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = BRANCH_MISS;

    // pthread_create() returns nonzero on failure. Without this check the
    // wait on thread_start below would spin forever.
    if (pthread_create(&th, nullptr, thread_handler, const_cast<th_data *>(&tdata)) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper reports that it is running.
    while (!thread_start) { }

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        if (enabled == 0)
            asm volatile ("":::"memory");
    }

    // Stop the helper and wait for it before the next benchmark reuses the globals.
    thread_exit = 1;
    pthread_join(th, nullptr);
}
BENCHMARK(BM_branch_miss_load_beqz);
//BENCHMARK(BM_branch_miss_load_beqz)->ThreadRange(1,3);
//BENCHMARK(BM_branch_miss_load_beqz)->ThreadPerCpu();

// Times the load + nonzero test of `enabled` (initially 1) while a helper
// thread runs thread_handler in CACHE_BRANCH_MISS mode, i.e. keeps flipping
// `enabled` between 0 and 1.
void BM_cache_branch_miss_load_bnez(benchmark::State& state) {
    pthread_t th;

    enabled = 1;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_BRANCH_MISS;

    // pthread_create() returns nonzero on failure. Without this check the
    // wait on thread_start below would spin forever.
    if (pthread_create(&th, nullptr, thread_handler, const_cast<th_data *>(&tdata)) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper reports that it is running.
    while (!thread_start) { }

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        if (enabled != 0)
            asm volatile ("":::"memory");
    }

    // Stop the helper and wait for it before the next benchmark reuses the globals.
    thread_exit = 1;
    pthread_join(th, nullptr);
}
BENCHMARK(BM_cache_branch_miss_load_bnez);
//BENCHMARK(BM_cache_branch_miss_load_bnez)->ThreadRange(1,3);
//BENCHMARK(BM_cache_branch_miss_load_bnez)->ThreadPerCpu();

// Times the load + zero test of `enabled` (initially 0) while a helper thread
// runs thread_handler in CACHE_BRANCH_MISS mode, i.e. keeps flipping `enabled`
// between 0 and 1.
void BM_cache_branch_miss_load_beqz(benchmark::State& state) {
    pthread_t th;

    enabled = 0;
    thread_start = 0;
    thread_exit = 0;

    tdata.enabled = enabled;
    tdata.type = CACHE_BRANCH_MISS;

    // pthread_create() returns nonzero on failure. Without this check the
    // wait on thread_start below would spin forever.
    if (pthread_create(&th, nullptr, thread_handler, const_cast<th_data *>(&tdata)) != 0) {
        state.SkipWithError("pthread_create failed");
        return;
    }

    // Wait until the helper reports that it is running.
    while (!thread_start) { }

    for (auto _ : state) {
        benchmark_DoNotOptimize();
        if (enabled == 0)
            asm volatile ("":::"memory");
    }

    // Stop the helper and wait for it before the next benchmark reuses the globals.
    thread_exit = 1;
    pthread_join(th, nullptr);
}
BENCHMARK(BM_cache_branch_miss_load_beqz);
//BENCHMARK(BM_cache_branch_miss_load_beqz)->ThreadRange(1,3);
//BENCHMARK(BM_cache_branch_miss_load_beqz)->ThreadPerCpu();


// Google Benchmark's entry-point macro: provides main() for this translation
// unit, which runs the benchmarks registered above with BENCHMARK().
BENCHMARK_MAIN();
